Let’s load some packages
suppressPackageStartupMessages({
library('data.table')
library('dplyr')
library('ggplot2')
library('cowplot')
})
Some functions that we will find helpful:
This function loads eXpress results with all parameters (default, --fr-stranded, --rf-stranded), given a sample and the column in the quantification file.
load_column <- function(sample_name, column_name) {
results_path <- file.path(base_dir, 'results', sample_name,
c('paper', 'paper_forward', 'paper_reverse'),
'express', 'results.xprs')
res <- lapply(results_path, read.table, header = TRUE, stringsAsFactors = FALSE)
res <- lapply(res, function(x) x[, c('target_id', column_name)])
res <- Map(function(df, new_name) {
data.table::setnames(df, column_name, new_name)
df
}, res, paste0(c('default', 'forward', 'reverse'), '_', column_name))
Reduce(function(x, y) dplyr::inner_join(x, y, by = 'target_id'), res)
}
This function loads a specific type of eXpress results across all samples:
load_all_samples <- function(which_mode, column_name) {
which_path <- switch(which_mode,
default = 'paper',
forward = 'paper_forward',
reverse = 'paper_reverse')
sample_names <- c('ENCLB037ZZZ', 'ENCLB038ZZZ', 'ENCLB055ZZZ', 'ENCLB056ZZZ')
results_path <- file.path(base_dir, 'results', sample_names, which_path,
'express', 'results.xprs')
res <- lapply(results_path, read.table, header = TRUE, stringsAsFactors = FALSE)
res <- lapply(res, function(x) x[, c('target_id', column_name)])
res <- Map(function(df, new_name) {
data.table::setnames(df, column_name, new_name)
df
}, res, sample_names)
res <- Reduce(function(x, y) dplyr::inner_join(x, y, by = 'target_id'), res)
res <- as.data.frame(res)
rownames(res) <- res$target_id
res$target_id <- NULL
res
}
base_dir <- '..'
current_sample <- 'ENCLB037ZZZ'
current_sample_tpm <- load_column(current_sample, 'tpm')
Let’s look at running the forward and reverse version on the first data set:
with(current_sample_tpm, cor(forward_tpm, reverse_tpm))
## [1] 0.1889952
The plot doesn’t look so great either:
ggplot(current_sample_tpm, aes(log(forward_tpm + 1), log(reverse_tpm + 1))) +
geom_point(alpha = 0.4)
Note: Here we will be comparing TPMs. It is not necessarily completely correct since the distribution is probably slightly different, but since they are replicates it should be more or less okay.
First let’s look at how many reads mapped:
reverse_counts <- load_all_samples('reverse', 'est_counts')
forward_counts <- load_all_samples('forward', 'est_counts')
very few reads map if you do not specify the correct strand (1.4-1.8M versus 60-75M reads)
sapply(forward_counts, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ
## 1819642 1591383 1384936 1720570
sapply(reverse_counts, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ
## 64750369 64927498 60580662 74232876
Let’s get the ordering in the original FASTA:
fasta_ordering <- system("grep '>' ../index/gencode.v16.pc_transcripts.fa | sed 's/^>//g'",
intern = TRUE)
reverse_tpm <- load_all_samples('reverse', 'tpm')
reverse_tpm <- reverse_tpm[fasta_ordering, ]
forward_tpm <- load_all_samples('forward', 'tpm')
forward_tpm <- forward_tpm[fasta_ordering, ]
forward_plot <- ggplot(forward_tpm, aes(log(ENCLB037ZZZ + 1), log(ENCLB038ZZZ + 1))) +
geom_point(alpha = 0.4) +
ggtitle('running express with forward strand')
reverse_plot <- ggplot(reverse_tpm, aes(log(ENCLB037ZZZ + 1), log(ENCLB038ZZZ + 1))) +
geom_point(alpha = 0.4) +
ggtitle('running express with reverse strand')
Notice the correlation is not so great using the forward strand:
forward_plot
It improves greatly if you use the reverse strand:
reverse_plot
Correlations:
with(forward_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ))
## [1] 0.9509102
with(forward_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ, method = 'spearman'))
## [1] 0.7175198
with(reverse_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ))
## [1] 0.9872897
with(reverse_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ, method = 'spearman'))
## [1] 0.8958683
It is unclear what the unit is on the website. Let’s figure it out:
tmp <- read.table('http://rafalab.rc.fas.harvard.edu/rnaseqcomp/encodeexample.txt',
header = TRUE)
It appears to be TPM:
sapply(tmp, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ
## 1000000.7 999999.8 1000000.5 1000000.4
Write the reverse results out for submission to http://rafalab.rc.fas.harvard.edu/rnaseqbenchmark:
write.table(reverse_tpm, file = '../results/reverse_merged.tpm',
quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] utils stats methods base
##
## other attached packages:
## [1] cowplot_0.6.2 ggplot2_2.1.0 dplyr_0.4.3 data.table_1.9.6
## [5] knitr_1.13
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.5 grDevices_3.3.0 digest_0.6.9 assertthat_0.1
## [5] plyr_1.8.3 grid_3.3.0 chron_2.3-47 R6_2.1.2
## [9] gtable_0.2.0 DBI_0.4-1 magrittr_1.5 scales_0.4.0
## [13] evaluate_0.9 stringi_1.1.1 graphics_3.3.0 rmarkdown_0.9.6
## [17] labeling_0.3 tools_3.3.0 stringr_1.0.0 munsell_0.4.3
## [21] yaml_2.1.13 parallel_3.3.0 colorspace_1.2-6 htmltools_0.3.5